knitr::opts_chunk$set(message = FALSE)
library(bslib)
library(dplyr)
library(DT)
library(ggplot2)
library(glue)
library(here)
library(lubridate)
library(plotly)
library(purrr)
library(readr)
library(rlang)
library(stringr)
library(tidyr)
theme_set(theme_bw())

# Root directory of the report files, supplied as a document parameter.
input_dir <- params$input_dir # here("data")
# Substrings identifying report types that are left out of the per-user file
# inventory built below.
aggregated_filetypes <- c("blamematrix", "catalog", "mimeo")
# TODO: only load last N weeks of data to keep RAM usage reasonably low

# Inventory of report files: one row per file found in the sub-directories of
# `input_dir` (files directly inside `input_dir` are not listed), with the
# dot-separated pieces of the path split into columns.
# NOTE: the split runs on the full path and expects at most five pieces, so a
# "." anywhere else in the path makes `separate_wider_delim()` raise an error.
user_dat <- tibble(
  filename = list.dirs(input_dir) %>%
    setdiff(input_dir) %>%
    lapply(list.files, full.names = TRUE) %>%
    unlist()
) %>%
  filter(!str_detect(filename, paste(aggregated_filetypes, collapse = "|"))) %>%
  separate_wider_delim(filename,
    delim = ".", cols_remove = FALSE,
    names = c("date", "path", "username", "file", "ext"),
    # Paths with fewer than five pieces get NA in the trailing columns and are
    # then dropped by the `ext` filter below. Unlike "debug" mode this emits no
    # warning and adds no `filename_ok` / `filename_pieces` /
    # `filename_remainder` columns to the result.
    too_few = "align_start"
  ) %>%
  filter(
    str_detect(ext, "tsv|txt"), # only keep tab-delimited files
    !str_detect(username, "[0-9]"), # drop usernames containing any digit
    username != "allusers" # filter out the 'allusers' rows
  ) %>%
  # The first piece still carries the directory prefix; its basename is the date.
  mutate(date = as_date(basename(date)))
## Warning: Debug mode activated: adding variables `filename_ok`, `filename_pieces`,
## and `filename_remainder`.
# Distinct snapshot dates and usernames present in the file inventory.
dates <- unique(user_dat$date)
most_recent_date <- max(dates)
usernames <- unique(user_dat$username)

# Save the inventory, stamped with today's date.
write_tsv(user_dat, here("results", glue("user-dat_{today()}.tsv")))

Most recent summary (2023-10-09)

Disk usage in /data/CCBR on Biowulf

# Read every "summary" report from the most recent snapshot, tag each row with
# the file it came from, split that file name into its dot-separated pieces,
# and keep only the rows describing /data/CCBR.
summary_dat_recent <- user_dat %>%
  # (optional extra condition: username %in% users_filter)
  filter(date == most_recent_date & file == "summary") %>%
  pull(filename) %>%
  map(function(tsv_file) {
    mutate(read_tsv(tsv_file), filename = tsv_file)
  }) %>%
  list_rbind() %>%
  separate_wider_delim(
    filename,
    delim = ".",
    names = c("basepath", "path", "username", "file", "ext"),
    cols_remove = FALSE
  ) %>%
  filter(FolderPath == "/data/CCBR")

# Save the combined table, stamped with today's date.
write_tsv(
  summary_dat_recent,
  here("results", glue("summary-dat-recent_{today()}.tsv"))
)

# Names of the numeric columns that actually hold rows in the recent summary,
# in column order; these are the metrics plotted below.
summary_metrics <- summary_dat_recent %>%
  pivot_longer(where(is.numeric), names_to = "metric") %>%
  distinct(metric) %>%
  pull(metric)

# Users appearing in the top 10 rows of at least one metric in the most recent
# snapshot. Metrics whose name contains "score"/"Score" are negated first, so
# for those the smallest values rank highest.
top_users <- summary_dat_recent %>%
  pivot_longer(all_of(summary_metrics), names_to = "metric") %>%
  mutate(
    value_adj = if_else(str_detect(metric, "[sS]core"), -value, value)
  ) %>%
  group_by(metric) %>%
  slice_max(order_by = value_adj, n = 10) %>%
  pull(username) %>%
  unique()

# One horizontal bar chart per summary metric, restricted to `top_users` and
# wrapped in a nav panel; the panels are then combined into a pill-list navset.
plots <- summary_metrics %>% lapply(function(y_metric) {
  top_user_dat <- summary_dat_recent %>%
    filter(username %in% top_users)
  # Score metrics are sorted on their negated value, the same adjustment used
  # when `top_users` is ranked.
  sort_sign <- if (str_detect(y_metric, "[sS]core")) -1 else 1
  # Bar order for this metric. Sorting the metric column directly avoids
  # pivoting every numeric column just to keep one of them; unique() keeps the
  # factor levels valid should a user ever have more than one row.
  user_order <- top_user_dat %>%
    arrange(sort_sign * .data[[y_metric]]) %>%
    pull(username) %>%
    unique()
  p <- top_user_dat %>%
    mutate(username = factor(username, levels = user_order)) %>%
    ggplot(aes(
      # `.data[[name]]` selects a column by its string name inside aes()
      x = .data[[y_metric]],
      y = username,
      fill = .data[[y_metric]],
      # `text` is not drawn by ggplot2; ggplotly() shows it as the tooltip
      text = glue("{username}\n{y_metric}\n{FolderPath}")
    )) +
    geom_col() +
    labs(x = y_metric, y = "") +
    theme(legend.position = "none")
  nav_panel(title = y_metric, card_header(y_metric), ggplotly(p, tooltip = "text"))
})
do.call(navset_pill_list, plots)
TotalBytes
DuplicateBytes
PercentDuplicateBytes
TotalFiles
DuplicateFiles
PercentDuplicateFiles
TotalMeanAge
DuplicateMeanAge
AgeScore
DupScore
OccScore
OverallScore

Summary over time

# Read every "summary" report across all snapshots, tag each row with its
# source file, split the file name into its dot-separated pieces, and derive
# the snapshot date from the text after the last "/" of the first piece.
summary_dat_all <- user_dat %>%
  # (optional extra condition: username %in% users_filter)
  filter(file == "summary") %>%
  pull(filename) %>%
  map(function(tsv_file) {
    mutate(read_tsv(tsv_file), filename = tsv_file)
  }) %>%
  list_rbind() %>%
  separate_wider_delim(
    filename,
    delim = ".",
    names = c("basepath", "path", "username", "file", "ext"),
    cols_remove = FALSE
  ) %>%
  mutate(date = as_date(str_replace(basepath, ".*/", ""))) %>%
  filter(FolderPath == "/data/CCBR") # TODO: repeat for /data/CCBR_Pipeliner

# Save the combined table, stamped with today's date.
write_tsv(
  summary_dat_all,
  here("results", glue("summary-dat-all_{today()}.tsv"))
)

# Users ranking in the top 10 of at least one metric across all snapshots.
# Metrics whose name contains "score"/"Score" are negated first, so for those
# the smallest values rank highest.
top_users <- summary_dat_all %>%
  pivot_longer(all_of(summary_metrics),
    names_to = "metric"
  ) %>%
  mutate(value_adj = case_when(
    str_detect(metric, "[sS]core") ~ -value,
    TRUE ~ value
  )) %>%
  # The table holds one row per user per snapshot, so ranking rows directly
  # lets a single user seen on many dates fill several of the ten slots.
  # Reduce to each user's best row per metric first, then rank the users.
  group_by(metric, username) %>%
  slice_max(order_by = value_adj, n = 1, with_ties = FALSE) %>%
  group_by(metric) %>%
  slice_max(order_by = value_adj, n = 10) %>%
  ungroup() %>%
  pull(username) %>%
  unique()

# One time-series chart per summary metric (a line plus points per user in
# `top_users`), each wrapped in a nav panel for the pill-list navset.
plots <- summary_metrics %>% lapply(function(y_metric) {
  p <- summary_dat_all %>%
    # Filtering on `top_users` directly selects the same rows as the former
    # per-metric `user_order` vector, without pivoting the table every time.
    filter(username %in% top_users) %>%
    ggplot(aes(
      x = date,
      # `.data[[name]]` selects a column by its string name inside aes()
      y = .data[[y_metric]],
      color = username
    )) +
    geom_line(alpha = 0.7) +
    # ggplot2 warns "Ignoring unknown aesthetics: text" here; the aesthetic is
    # only consumed by ggplotly(), which shows it as the tooltip.
    geom_point(aes(text = glue("{username}\n{y_metric}\n{FolderPath}\n{date}"))) +
    labs(y = y_metric)
  nav_panel(title = y_metric, card_header(y_metric), ggplotly(p, tooltip = "text"))
})
## Warning in geom_point(aes(text = glue("{username}\n{y_metric}\n{FolderPath}\n{date}"))): Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
# Pass every nav panel in `plots` as a separate argument to navset_pill_list().
do.call(navset_pill_list, plots)
TotalBytes
DuplicateBytes
PercentDuplicateBytes
TotalFiles
DuplicateFiles
PercentDuplicateFiles
TotalMeanAge
DuplicateMeanAge
AgeScore
DupScore
OccScore
OverallScore

High-value duplicate files (2023-10-09)

# Load the header-less "grubbers" tables for /data/CCBR from the most recent
# snapshot directory (named YYYYMMDD), name their five columns, split the
# source file name into its dot-separated pieces, and split each disk-usage
# string into a numeric value and a unit.
grub_dat <- list.files(
  # Build the path from `input_dir` itself, the same way the file inventory is
  # listed, rather than re-rooting it at the project root with here().
  file.path(
    input_dir,
    format(most_recent_date, format = "%Y%m%d")
  ),
  full.names = TRUE
) %>%
  str_subset("_data_CCBR\\..*\\.grubbers\\.tsv") %>%
  map(function(tsv_file) {
    read_tsv(tsv_file, col_names = FALSE) %>%
      mutate(filename = tsv_file)
  }) %>%
  list_rbind() %>%
  rename(
    file_hash = X1,
    file_count = X2,
    total_disk_usage = X3,
    single_disk_usage = X4,
    filepaths = X5
  ) %>%
  separate_wider_delim(filename,
    delim = ".", cols_remove = FALSE,
    names = c("date", "path", "username", "file", "ext")
  ) %>%
  # The first piece still carries the directory prefix; its basename is the date.
  mutate(date = as_date(basename(date))) %>%
  filter(
    !str_detect(username, "[0-9]"), # drop usernames containing any digit
    username != "allusers" # filter out the 'allusers' rows
  ) %>%
  # Disk usage is split on a single space into "<value> <unit>"; the original
  # string columns are kept alongside the new ones.
  separate_wider_delim(total_disk_usage,
    delim = " ",
    names = c("total_disk_usage_value", "total_disk_usage_unit"),
    cols_remove = FALSE
  ) %>%
  separate_wider_delim(single_disk_usage,
    delim = " ",
    names = c("single_disk_usage_value", "single_disk_usage_unit"),
    cols_remove = FALSE
  ) %>%
  mutate(across(all_of(c("total_disk_usage_value", "single_disk_usage_value")), as.numeric))

# Save the table, stamped with today's date.
grub_dat %>% write_tsv(here("results", glue("grub-dat_{today()}.tsv")))

# Duplicate-file entries ordered from largest to smallest total disk usage,
# reduced to the three columns shown in the table.
# NOTE(review): the value is labelled as GB without consulting
# `total_disk_usage_unit` -- confirm every row is reported in the same unit.
top_files <- grub_dat %>%
  arrange(desc(total_disk_usage_value)) %>%
  select(disk_usage_gb = total_disk_usage_value, username, filepaths)

card(
  card_header("Top files"),
  datatable(top_files, fillContainer = TRUE)
)
Top files